# This script uses NO empirical data.
# It simply calculates and visualizes the mathematically possible
# variance among URLs that average to a given domain score.

import numpy as np
import itertools as it

import cmocean as cmo
import matplotlib.pyplot as plt

def calc_scores(shares):
    """Return every score a URL with ``shares`` total shares can take.

    Entry ``k`` of the result corresponds to splitting the shares into
    ``R = k`` and ``D = shares - k`` and scoring the split as
    ``(R - D) / (R + D)``.

    Parameters
    ----------
    shares : int
        Total number of shares the URL receives.

    Returns
    -------
    numpy.ndarray
        Float array of length ``shares + 1``; for ``shares >= 1`` it runs
        from -1.0 (``k = 0``) up to 1.0 (``k = shares``).
    """
    print('  Calculating Possible URL Scores...')

    n_splits = shares + 1
    scores = np.empty(n_splits)

    # With R = k and D = shares - k, the numerator R - D is 2k - shares and
    # the denominator R + D is just shares, so each score follows from k.
    scores[:] = [(2 * k - shares) / shares for k in range(n_splits)]

    return scores


def get_domain_scores(scores, urls):
    """Summarise every size-``urls`` combination (with repetition) of scores.

    Each combination stands for one hypothetical domain whose URLs carry
    those scores.  The domain score is the mean of the combination rounded
    to two decimals; the URL variance is ``np.var`` of the combination.

    Parameters
    ----------
    scores : sequence of float
        Possible per-URL scores to draw from.
    urls : int
        Number of URLs per domain (length of each combination).

    Returns
    -------
    domain_scores : dict
        Maps each rounded domain score to the number of combinations that
        produce it.
    url_vars : dict
        Maps each rounded domain score to the list of variances, one per
        combination, in enumeration order.
    """
    print('  Calculating possible domain scores...')

    occurrences = {}  # rounded domain score -> number of combinations
    variances = {}    # rounded domain score -> variance of each combination

    for combo in it.combinations_with_replacement(scores, urls):
        # convert once so mean and variance share the same array
        combo_arr = np.asarray(combo)

        # domain score is the average of the URL scores
        domain_score = np.round(np.mean(combo_arr), 2)

        occurrences[domain_score] = occurrences.get(domain_score, 0) + 1
        variances.setdefault(domain_score, []).append(np.var(combo_arr))

    return occurrences, variances


if __name__ == '__main__':
    # Script entry point: enumerate the possible URL score combinations for
    # a hypothetical domain, then plot how often each (rounded) URL variance
    # occurs at each domain score and save the figure to 'fig_a1.pdf'.
    shares = 15
    urls = 15

    print(f'Calculating possible URL-level variance for domains with {urls} urls, where each URL receives {shares} shares.')

    scores = calc_scores(shares)
    domain_scores, url_vars = get_domain_scores(scores, urls)

    # visualize URL variance for each domain
    print('Visualizing...')

    x = list()     # domain score
    y = list()     # variances
    size = list()  # count of times variance occurs

    for mean_score, url_var in url_vars.items():
        # Round variances for visualization, then tally every distinct value
        # in one pass.  Calling list.count() once per distinct value would
        # rescan the whole list each time, which is quadratic on the very
        # long variance lists this script produces.
        var_values, var_counts = np.unique(np.round(url_var, 3),
                                           return_counts=True)

        x.extend([mean_score] * len(var_values))
        y.extend(var_values.tolist())
        size.extend(var_counts.tolist())

    fig, ax = plt.subplots(figsize=(10, 6))

    cmap = cmo.cm.amp
    dark = cmap(.8)

    # Marker area is proportional to how many combinations share that
    # (domain score, variance) pair; the factor keeps markers readable.
    ax.scatter(x, y,
               s=np.array(size)*.001,
               rasterized=True,
               color=dark)

    ax.set_xlabel('Partisanship Score', size=16)
    ax.set_ylabel('URL Variance', size=16)
    ax.set_title('Potential URL Variance by Domain Score', size=18)
    ax.tick_params(axis='x', labelsize=14)
    ax.tick_params(axis='y', labelsize=14)
    fig.savefig('fig_a1.pdf',
                bbox_inches='tight', dpi=300)

    print('Figure saved to file.')
